I have a typical project: predicting NYC Uber/Lyft trip demand. The dataset covers January 2022 to March 2023. The area is already divided into different locations, and I want the predicted demand for each location every 15 minutes.
The goal of this project is to predict the demand for Uber/Lyft trips in each NYC location every 15 minutes, using a dataset spanning January 2022 to March 2023. The dataset includes the dispatching base number, pickup datetime, drop-off datetime, pickup location ID, drop-off location ID, SR_Flag, and affiliated base number.
import pandas as pd
import glob
import tqdm
import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from dateutil.relativedelta import relativedelta
import numpy as np
from pmdarima import auto_arima
# Step 1: Load the dataset
# Only these two columns are used downstream (pickup counts per location over time).
interested_features = ['pickup_datetime', 'PUlocationID']

# glob returns paths in arbitrary order; sorting makes the run reproducible and,
# because the files are named fhv_tripdata_YYYY-MM.csv, chronological.
data_list_path = sorted(glob.glob('Datasets/fhv_tripdata_2022-2023_in_csv/*.csv'))
if not data_list_path:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        'No CSV files found under Datasets/fhv_tripdata_2022-2023_in_csv/'
    )

list_df = []
for path in data_list_path:
    print(path)
    # usecols keeps the unused columns of each monthly file out of memory.
    list_df.append(pd.read_csv(path, usecols=interested_features))

# ignore_index builds one unique 0..n-1 index; without it every file's own row
# numbers are repeated in the combined frame.
df = pd.concat(list_df, ignore_index=True)
# usecols does not control column order, so pin it explicitly.
df = df[interested_features]
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-09.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-02.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-04.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-07.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-06.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-08.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-03.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-11.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-12.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-02.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-03.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-01.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-05.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-10.csv
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
# Discard every row that has a missing value in any column, reporting the
# row count on either side of the filter.
rows_before = len(df.index)
print('Number of Rows Before Removing NaN:', rows_before)

removed_nan_df = df.dropna(axis=0, how='any')

rows_after = len(removed_nan_df.index)
print('Number of Rows After Removing NaN:', rows_after)
Number of Rows Before Removing NaN: 17712727 Number of Rows After Removing NaN: 4164902
import pandas as pd
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from prophet import Prophet
# Fit one Prophet model per pickup location on binned pickup counts, forecast
# the held-out tail of the series, and plot train / test / forecast together.

# Number of locations to fit and plot (a preview, not a sweep over every location).
MAX_LOCATIONS = 6
# Bin width of the demand series. The same value drives both the resampling and
# the forecast horizon so the two can never drift apart. Hourly for now; the
# stated project goal is 15-minute demand, which would be '15min' here.
RESAMPLE_FREQ = '1H'
# Share of each location's series used for training; the remainder is held out.
TRAIN_FRACTION = 0.95

print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])

location_ids = removed_nan_df['PUlocationID'].unique().tolist()

for lc_id in location_ids[:MAX_LOCATIONS]:
    print('Location ID:', lc_id)

    # .copy() gives this location its own frame, so the datetime assignment
    # below no longer triggers pandas' SettingWithCopyWarning.
    df_subset = removed_nan_df[removed_nan_df['PUlocationID'] == lc_id].copy()
    df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])

    # Count pickups per time bin; the result has columns
    # 'pickup_datetime' (bin start) and 'PUlocationID' (number of pickups).
    df_subset = (
        df_subset.sort_values('pickup_datetime')
        .set_index('pickup_datetime')['PUlocationID']
        .resample(RESAMPLE_FREQ)
        .count()
        .reset_index()
    )

    # Split data into training and testing sets (chronological, no shuffling)
    train_size = int(len(df_subset) * TRAIN_FRACTION)
    train_data = df_subset[:train_size]
    test_data = df_subset[train_size:]

    # Prophet refuses to fit on fewer than two rows; skip instead of crashing.
    if len(train_data) < 2:
        print('Skipping Location ID:', lc_id, '- not enough data to fit a model')
        continue

    # Prepare data for Prophet model (it expects columns named 'ds' and 'y')
    prophet_train_data = train_data.rename(
        columns={'pickup_datetime': 'ds', 'PUlocationID': 'y'}
    )

    # Create and fit the Prophet model
    model = Prophet(
        seasonality_mode='additive',
        daily_seasonality=True,    # Enable daily seasonality
        weekly_seasonality=True,   # Enable weekly seasonality
        yearly_seasonality=False,  # Disable yearly seasonality
    )
    model.fit(prophet_train_data)

    # Generate future dates covering exactly the held-out test period
    future_dates = model.make_future_dataframe(
        periods=len(test_data), freq=RESAMPLE_FREQ
    )

    # Make predictions and keep only the rows for the test period.
    # tail(n) is used instead of [-n:] because [-0:] would return every row.
    forecast = model.predict(future_dates)
    forecast = forecast[['ds', 'yhat']].tail(len(test_data))

    # Plotting
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=prophet_train_data['ds'], y=prophet_train_data['y'],
        mode='lines+markers', name='Training Data',
    ))
    fig.add_trace(go.Scatter(
        x=test_data['pickup_datetime'], y=test_data['PUlocationID'],
        mode='lines+markers', name='Testing Data',
    ))
    fig.add_trace(go.Scatter(
        x=forecast['ds'], y=forecast['yhat'],
        mode='lines+markers', name='Prophet Forecast',
    ))
    fig.update_layout(
        title=f'PickLocation ID: {lc_id} - Facebook Prophet',
        xaxis_title='Time',
        yaxis_title='Number Drives',
    )
    fig.show()
/home/iffi/anaconda3/envs/sep_darts_2/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Number of Rows Before Removing NaN: 17712727 Number of Rows After Removing NaN: 4164902 Location ID: 12.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime']) 12:04:51 - cmdstanpy - INFO - Chain [1] start processing 12:04:53 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 89.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 12:04:54 - cmdstanpy - INFO - Chain [1] start processing 12:04:55 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 87.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 12:04:57 - cmdstanpy - INFO - Chain [1] start processing 12:04:58 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 230.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 12:05:00 - cmdstanpy - INFO - Chain [1] start processing 12:05:01 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 73.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 12:05:03 - cmdstanpy - INFO - Chain [1] start processing 12:05:04 - cmdstanpy - INFO - Chain [1] done processing
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 12:05:05 - cmdstanpy - INFO - Chain [1] start processing
Location ID: 93.0
12:05:07 - cmdstanpy - INFO - Chain [1] done processing
# df_subset.values
# df_subset
# df = df_subset
# df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# df = df.set_index('pickup_datetime')
# df['pickups_per_hour'] = df['PUlocationID'].resample('3H').count()
# df
# Bare expression: as the last statement of a notebook cell this renders `df`
# (the concatenated frame, before NaN rows are dropped) as the cell output.
# It has no effect when the file is run as a plain script.
df
| pickup_datetime | PUlocationID | |
|---|---|---|
| 0 | 2022-09-01 00:34:00 | NaN |
| 1 | 2022-09-01 00:10:00 | NaN |
| 2 | 2022-09-01 00:58:35 | NaN |
| 3 | 2022-09-01 00:50:00 | NaN |
| 4 | 2022-09-01 00:45:00 | NaN |
| ... | ... | ... |
| 1174983 | 2022-10-31 23:30:36 | NaN |
| 1174984 | 2022-10-31 23:15:13 | NaN |
| 1174985 | 2022-10-31 23:41:39 | NaN |
| 1174986 | 2022-10-31 23:15:23 | NaN |
| 1174987 | 2022-10-31 23:33:06 | NaN |
17712727 rows × 2 columns
# import pandas as pd
# import matplotlib.pyplot as plt
# # Step 1: Preprocess the Dataset
# df = pd.read_csv('Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv')
# df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# df['dropOff_datetime'] = pd.to_datetime(df['dropOff_datetime'])
# df.set_index('pickup_datetime', inplace=True)
# # Step 2: Resample the Dataset
# demand_15_mints = df[['PUlocationID', 'DOlocationID']].resample('15T').size()
# demand_30_mints = df[['PUlocationID', 'DOlocationID']].resample('30T').size()
# demand_60_mints = df[['PUlocationID', 'DOlocationID']].resample('1h').size()
# # Step 3: Predict the Demand (using your preferred model)
# # Step 4: Visualize the Demand
# demand_15_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()
# # Step 4: Visualize the Demand
# demand_30_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()
# # Step 4: Visualize the Demand
# demand_60_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()
# demand_30_mints